ML Analysis of LGG Samples from Liquid Biopsy¶

Author: Shehbeel Arif¶

Preclinical Laboratory Research Unit¶

The Center for Data Driven Discovery in Biomedicine (D3b)¶

Children's Hospital of Philadelphia¶

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from umap.umap_ import UMAP
import plotly.express as px

# PCA
from sklearn.decomposition import PCA
# Library for t-SNE projections
from sklearn.manifold import TSNE
In [2]:
meta = pd.read_csv('lgg_lb_meta.csv')
meta = meta.set_index(['SDG_ID'])
#meta
In [3]:
df = pd.read_csv('lb_plasma_matrix.csv')
tdf = df.T
tdf.columns = tdf.iloc[0] 
tdf = tdf[1:]
#tdf
In [4]:
main_df = pd.concat([meta, tdf], axis=1, join="inner")
#main_df

Using Short Histology¶

In [5]:
short_histology_df = main_df.drop(['Specimen_Type', 'Diagnosis', 'Tumor_Subtype', 'Relapse', 'Survival_Status'],axis=1)
short_histology_df
Out[5]:
Short_Histology CTRL_ANT1 CTRL_ANT2 CTRL_ANT3 CTRL_ANT4 CTRL_ANT5 CTRL_miR_POS HK_ACTB HK_B2M HK_GAPDH ... miR-944 miR-95-3p miR-95-5p miR-9-5p miR-96-3p miR-96-5p miR-98-3p miR-99a-5p miR-99b-3p miR-99b-5p
15635-37 LGG 16 16 22 25 5 20225 27 109 6245 ... 23 44 31 19 21 125 38 891 75 290
15635-43 HGG 35 60 49 60 47 51017 30 103 12707 ... 44 83 76 37 37 95 96 1876 78 662
15635-45 HGG 122 139 145 95 127 16810 145 1535 3414 ... 79 179 212 179 113 430 111 1141 102 1034
15635-46 LGG 41 60 99 68 49 58350 95 239 29270 ... 11 99 172 67 35 62 118 1972 143 596
15635-53 HGG 6 20 3 27 5 20365 39 302 6065 ... 13 51 27 7 20 316 26 2028 110 540
15635-60 LGG 12 68 43 102 35 87035 58 199 19675 ... 53 98 127 28 104 20 123 1020 174 370
15635-68 LGG 99 86 36 99 60 39440 79 748 15361 ... 61 190 162 98 75 256 67 1842 168 1263
15635-80 LGG 56 87 97 461 47 7490 77 150 17922 ... 68 82 142 65 55 263 37 377 217 960
15635-87 HGG 76 77 97 88 98 44954 45 262 4624 ... 117 201 202 47 65 297 55 3270 248 896
15635-90 LGG 69 52 76 110 68 20213 67 175 5244 ... 40 84 56 81 73 168 46 3492 155 282
15635-100 LGG 0 17 26 20 18 38373 195 3374 17375 ... 0 57 62 24 22 120 72 3586 73 919
15635-101 LGG 39 30 21 33 23 19047 17 86 8125 ... 39 62 92 39 27 126 33 1508 64 254
15635-127 LGG 16 6 19 73 16 24390 33 304 8055 ... 7 28 50 38 10 113 27 865 44 335
15635-134 LGG 28 50 19 64 4 47361 302 1665 12094 ... 60 147 104 40 31 180 75 2621 154 1349
15635-154 HGG 9 30 3 26 11 22451 5 55 2794 ... 8 92 65 15 52 259 29 985 69 467
15635-156 HGG 45 14 36 15 23 37931 75 837 2349 ... 46 220 67 0 0 307 40 8387 128 1737
15635-239 HGG 24 24 77 92 24 23781 105 223 306 ... 44 110 94 49 27 340 53 2902 189 823

17 rows × 2103 columns


Exploring the data¶

In [6]:
# Store histologies
hist = short_histology_df['Short_Histology'].to_list()

# Plot UMAP
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
proj_3d = umap_3d.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])

ufig_2d = px.scatter(proj_2d, x=0, y=1, title='UMAP Plot of Short Histology', color=hist)

ufig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=hist)

ufig_2d.show()
#ufig_3d.show()

# Plot t-SNE
tsne_2d = TSNE(n_components=2, init='random', random_state=0, learning_rate='auto')
tsne_3d = TSNE(n_components=3, init='random', random_state=0, learning_rate='auto')

proj_2d = tsne_2d.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
proj_3d = tsne_3d.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])

tfig_2d = px.scatter(proj_2d, x=0, y=1, title='t-SNE Plot of Short Histology', color=hist)

tfig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=hist)

tfig_2d.show()
#tfig_3d.show()

# Perform PCA
pca2 = PCA(n_components=2)
pca3 = PCA(n_components=3)

df_pca2 = pca2.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
df_pca3 = pca3.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])

# Display PCA plot
pca_2d = px.scatter(df_pca2, x=0, y=1, title='PCA Plot of Short Histology', color=hist)

# Display 3D PCA plot
pca_3d = px.scatter_3d(df_pca3, x=0, y=1, z=2, color=hist)

pca_2d.show()
#pca_3d.show()

In [8]:
# Split the dataset into features and labels
sh_X = short_histology_df.loc[:, short_histology_df.columns != 'Short_Histology'].values
sh_y = short_histology_df.loc[:, short_histology_df.columns == 'Short_Histology'].values.ravel()
In [9]:
# Split data into training and testing set
sh_X_train, sh_X_test, sh_y_train, sh_y_test = train_test_split(sh_X, sh_y, test_size=0.25, random_state=42)

#Sanity check
print(sh_X_train.shape, sh_X_test.shape, sh_y_train.shape, sh_y_test.shape)
(12, 2102) (5, 2102) (12,) (5,)
In [10]:
# Class Imbalance
fig = px.histogram(short_histology_df, x='Short_Histology', title='Histology Class Balance')
fig.show()
In [11]:
# Initialize random forest classifier
sh_rf = RandomForestClassifier(max_depth=2, random_state=0)

# Train the random forest classifier
sh_rf.fit(sh_X_train, sh_y_train)

# Make predictions using random forest classifier
sh_rf_y_pred = sh_rf.predict(sh_X_test)
In [12]:
# Accuracy of model
print(f'Accuracy: {accuracy_score(sh_y_test, sh_rf_y_pred)}')
Accuracy: 0.6
In [13]:
# Calculate a confusion matrix
sh_cm = confusion_matrix(sh_y_test, sh_rf_y_pred, labels=sh_rf.classes_)

# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(sh_cm, text_auto=True,
                labels=dict(x="True Subtype", y="Predicted Subtype", color="Productivity"),
                x=short_histology_df['Short_Histology'].unique().tolist(),
                y=short_histology_df['Short_Histology'].unique().tolist(),
                title='Histology Classification Accuracy'
                )
disp.show()
In [14]:
# What are the most important features?
sh_rf_feature_list = short_histology_df.columns
sh_rf_feature_list = sh_rf_feature_list.drop('Short_Histology')

sh_rf_imp_features = pd.Series(sh_rf.feature_importances_, index=sh_rf_feature_list)

sh_rf_imp_genes = sh_rf_imp_features.sort_values(ascending=False).to_frame().reset_index()
sh_rf_imp_genes.columns = ["features", "importance"]

sh_rf_imp_genes_fil = sh_rf_imp_genes[~(sh_rf_imp_genes == 0.000000).any(axis=1)]
sh_rf_imp_genes_fil
Out[14]:
features importance
0 miR-4685-3p 0.030000
1 miR-374b-3p 0.030000
2 miR-4727-5p 0.020000
3 miR-6722-5p 0.020000
4 miR-6798-5p 0.020000
... ... ...
94 miR-223-5p 0.002857
95 miR-17-3p 0.002857
96 miR-181b-5p 0.002857
97 miR-6872-3p 0.002857
98 miR-6078 0.002857

99 rows × 2 columns

In [15]:
# Display interactive Barplot of important miRNAs
fig = px.bar(sh_rf_imp_genes_fil, x=sh_rf_imp_genes_fil.features, y=sh_rf_imp_genes_fil.importance)
fig.show()

Relapse¶

In [16]:
relapse_df = main_df.drop(['Specimen_Type', 'Diagnosis', 'Short_Histology', 'Tumor_Subtype', 'Survival_Status'],axis=1)
#relapse_df
In [17]:
# Store relapse data
relapse = relapse_df['Relapse'].to_list()

# Plot UMAP
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(relapse_df.loc[:,'CTRL_ANT1':])
proj_3d = umap_3d.fit_transform(relapse_df.loc[:,'CTRL_ANT1':])

ufig_2d = px.scatter(proj_2d, x=0, y=1, title='UMAP Plot of Relapse', color=relapse)

ufig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=relapse)

ufig_2d.show()
#ufig_3d.show()

# Plot t-SNE
tsne_2d = TSNE(n_components=2, init='random', random_state=0, learning_rate='auto')
tsne_3d = TSNE(n_components=3, init='random', random_state=0, learning_rate='auto')

proj_2d = tsne_2d.fit_transform(relapse_df.loc[:,'CTRL_ANT1':])
proj_3d = tsne_3d.fit_transform(relapse_df.loc[:,'CTRL_ANT1':])

tfig_2d = px.scatter(proj_2d, x=0, y=1, title='t-SNE Plot of Relapse', color=relapse)

tfig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=relapse)

tfig_2d.show()
#tfig_3d.show()

# Perform PCA
pca2 = PCA(n_components=2)
pca3 = PCA(n_components=3)

df_pca2 = pca2.fit_transform(relapse_df.loc[:,'CTRL_ANT1':])
df_pca3 = pca3.fit_transform(relapse_df.loc[:,'CTRL_ANT1':])

# Display PCA plot
pca_2d = px.scatter(df_pca2, x=0, y=1, title='PCA Plot of Relapse', color=relapse)

# Display 3D PCA plot
pca_3d = px.scatter_3d(df_pca3, x=0, y=1, z=2, color=relapse)

pca_2d.show()
#pca_3d.show()
In [18]:
# Split the dataset into features and labels
r_X = relapse_df.loc[:, relapse_df.columns != 'Relapse'].values
r_y = relapse_df.loc[:, relapse_df.columns == 'Relapse'].values.ravel()

# Split data into training and testing set
r_X_train, r_X_test, r_y_train, r_y_test = train_test_split(r_X, r_y, test_size=0.25, random_state=42)

#Sanity check
print(r_X_train.shape, r_X_test.shape, r_y_train.shape, r_y_test.shape)
(12, 2102) (5, 2102) (12,) (5,)
In [19]:
# Class Imbalance
fig = px.histogram(relapse_df, x='Relapse')
fig.show()
In [20]:
# Initialize random forest classifier
r_rf = RandomForestClassifier(max_depth=2, random_state=0)

# Train the random forest classifier
r_rf.fit(r_X_train, r_y_train)

# Make predictions using random forest classifier
r_rf_y_pred = r_rf.predict(r_X_test)
In [21]:
# Accuracy of model
print(f'Accuracy: {accuracy_score(r_y_test, r_rf_y_pred)}')
Accuracy: 0.8
In [22]:
# Calculate a confusion matrix
r_cm = confusion_matrix(r_y_test, r_rf_y_pred, labels=r_rf.classes_)

# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(r_cm, text_auto=True,
                labels=dict(x="True Relapse", y="Predicted Relapse", color="Productivity"),
                x=relapse_df['Relapse'].unique().tolist(),
                y=relapse_df['Relapse'].unique().tolist()
                )
disp.show()
In [38]:
print(r_X_test)
print('True Relapse Label:')
print(r_y_test)
print('Predicted Relapse:')
print(r_rf_y_pred)
[[16 16 22 ... 891 75 290]
 [35 60 49 ... 1876 78 662]
 [12 68 43 ... 1020 174 370]
 [45 14 36 ... 8387 128 1737]
 [39 30 21 ... 1508 64 254]]
True Relapse Label:
['Yes' 'Yes' 'No' 'Yes' 'No']
Predicted Relapse:
['Yes' 'Yes' 'No' 'Yes' 'Yes']

The ML algorithm misclassified patient 15635-101 as having a relapse, but this patient did not at the time of plasma collection. This patient should be investigated further for relapse.

In [23]:
# How is the Random Forest Model Learning?

from sklearn import tree

fn = relapse_df.loc[:, relapse_df.columns != 'Relapse'].columns.to_list()
cn = r_rf.classes_.tolist()
tree.plot_tree(r_rf.estimators_[0], feature_names = fn, class_names=cn, filled = True)
Out[23]:
[Text(0.5, 0.75, 'miR-873-5p <= 1924.0\ngini = 0.278\nsamples = 6\nvalue = [2, 10]\nclass = Yes'),
 Text(0.25, 0.25, 'gini = 0.0\nsamples = 5\nvalue = [0, 10]\nclass = Yes'),
 Text(0.75, 0.25, 'gini = 0.0\nsamples = 1\nvalue = [2, 0]\nclass = No')]
In [24]:
# What are the most important features?
r_rf_feature_list = relapse_df.columns
r_rf_feature_list = r_rf_feature_list.drop('Relapse')

r_rf_imp_features = pd.Series(r_rf.feature_importances_, index=r_rf_feature_list)

r_rf_imp_genes = r_rf_imp_features.sort_values(ascending=False).to_frame().reset_index()
r_rf_imp_genes.columns = ["features", "importance"]

r_rf_imp_genes_fil = r_rf_imp_genes[~(r_rf_imp_genes == 0.000000).any(axis=1)]
r_rf_imp_genes_fil
Out[24]:
features importance
0 miR-6506-3p 0.030303
1 miR-502-5p 0.020202
2 miR-4677-5p 0.020202
3 miR-19a-5p 0.020202
4 miR-887-5p 0.020202
... ... ...
91 miR-7106-3p 0.006734
92 miR-1976 0.006734
93 miR-7157-3p 0.003367
94 miR-4636 0.003367
95 miR-505-3p 0.002886

96 rows × 2 columns

In [25]:
# Display interactive Barplot of important miRNAs
fig = px.bar(r_rf_imp_genes_fil, x=r_rf_imp_genes_fil.features, y=r_rf_imp_genes_fil.importance)
fig.show()

How good are the ML found genes for histology classification?¶

In [26]:
# Create a dataframe containing only ML selected genes
ml_hist_df = short_histology_df.filter(sh_rf_imp_genes_fil['features'].to_list()[0:1])

umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(ml_hist_df)
proj_3d = umap_3d.fit_transform(ml_hist_df)

fig_2d = px.scatter(proj_2d, x=0, y=1, color=hist, title='UMAP Plot using genes found by Random Forest Algorithm')

fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=hist)

fig_2d.show()
fig_3d.show()
In [27]:
sh_rf_imp_genes_fil['features'].to_list()[0:1]
Out[27]:
['miR-4685-3p']

How good are the ML found genes for relapse prediction?¶

In [29]:
# Create a dataframe containing only ML selected genes
ml_relapse_df = relapse_df.filter(r_rf_imp_genes_fil['features'].to_list())

umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(ml_relapse_df)
proj_3d = umap_3d.fit_transform(ml_relapse_df)

fig_2d = px.scatter(proj_2d, x=0, y=1, color=relapse, title='UMAP Plot for using genes found by Random Forest Algorithm (Relapse)')

fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=relapse)

fig_2d.show()
fig_3d.show()
In [30]:
# Create a dataframe containing only ML selected genes
ml_relapse_df = relapse_df.filter(r_rf_imp_genes_fil['features'].to_list()[0:1]) # Also [0:10]

umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)

proj_2d = umap_2d.fit_transform(ml_relapse_df)
proj_3d = umap_3d.fit_transform(ml_relapse_df)

fig_2d = px.scatter(proj_2d, x=0, y=1, color=relapse, title='UMAP Plot using most important gene found by Random Forest Algorithm (Relapse)')

fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=relapse)

fig_2d.show()
fig_3d.show()
In [31]:
# The most important gene determined by ML algorithm
r_rf_imp_genes_fil['features'].to_list()[0:1]
Out[31]:
['miR-6506-3p']

Interestingly, patient 15635-101 is being clustered with the relapsed patients based on just miRNA signature.


Export list of genes¶

In [32]:
# Add "hsa-" to all miRNA genes
hist_genes = sh_rf_imp_genes_fil['features'].to_list()
relapse_genes = r_rf_imp_genes_fil['features'].to_list()

hist_genes = ["hsa-" + mirnas for mirnas in hist_genes]
relapse_genes = ["hsa-" + mirnas for mirnas in relapse_genes]
In [33]:
# Export miRNA genes important for distinguishing LGG and HGG brain tumors
with open("LGG_miRNAs.txt", 'w') as file:
        for row in hist_genes:
            s = "".join(map(str, row))
            file.write(s+'\n')

# Export miRNA genes important for relapse in LGG/HGG brain tumors
with open("LGG_relapse_miRNAs.txt", 'w') as file:
        for row in relapse_genes:
            s = "".join(map(str, row))
            file.write(s+'\n')

Comparing with Shiny App results¶

In [ ]:
dip_test_genes = ['miR−339−5p', 'miR−1299', 'let−7g−5p', 'miR−145−3p', 'miR−6752−5p', 'miR−708−5p', 'miR−3679−3p',
                  'miR−448', 'miR−548ag', 'miR−4796−5p', 'miR−6844', 'miR−5697', 'miR−4765', 'miR−8076', 'miR−1278', 
                  'miR−3606−5p', 'miR−4424', 'miR−2114−3p', 'miR−4677−5p', 'miR−644a ', 'miR−6739−3p']
In [ ]:
relapse_df[[dip_test_genes]]

Next Steps:¶

  • Check if the miRNA genes found are differentially expressed between LGG and HGG patients, and between relapse and non-relapse patients.